# =============================================================================
# MASTER REPLICATION SCRIPT
# Title   : [The Cross and Conflict: How do Christians Impact Protest Dynamics?]
# Authors : [Joel Day]
# Journal : [Journal for the Scientific Study of Religion]
# Date    : 2026
#
# Description:
#   Single-file replication script for all models, figures, and tables.
#   Runs sequentially from data loading through final output. All outputs
#   are written to /output. Model fits are cached to avoid re-estimation
#   on repeat runs.
#
# Usage:
#   1. Place data at: data/updatedCNdataA.xlsx
#   2. Set working directory to this file's location
#   3. Run: source("mvprobit_replication_v2.R")
#
# R version: 4.3+ recommended
# =============================================================================


# =============================================================================
# 0. SETUP
# =============================================================================

suppressPackageStartupMessages({
  library(dplyr)
  library(tidyr)
  library(ggplot2)
  library(readxl)
  library(mvProbit)
  library(patchwork)
  library(officer)
  library(flextable)
})

# --- Paths -------------------------------------------------------------------
DATA_PATH  <- "data/updatedCNdataA.xlsx"
OUT_DIR    <- "output"
FIG_DIR    <- file.path(OUT_DIR, "figures")
TAB_DIR    <- file.path(OUT_DIR, "tables")
FITS_CACHE <- file.path(OUT_DIR, "fits.rds")

for (d in c(OUT_DIR, FIG_DIR, TAB_DIR)) {
  dir.create(d, showWarnings = FALSE, recursive = TRUE)
}

# --- Global options ----------------------------------------------------------
set.seed(123)
options(scipen = 999)

# --- Shared plot theme -------------------------------------------------------
theme_publication <- function(base_size = 11) {
  theme_bw(base_size = base_size) +
    theme(
      strip.background  = element_blank(),
      strip.text        = element_text(face = "bold", size = base_size),
      panel.grid.minor  = element_blank(),
      panel.grid.major  = element_line(color = "grey90"),
      axis.title        = element_text(size = base_size),
      axis.text         = element_text(size = base_size - 1),
      plot.caption      = element_text(hjust = 0, size = 8, color = "grey40"),
      plot.title        = element_text(face = "bold", hjust = 0.5,
                                       size = base_size + 1),
      legend.position   = "bottom"
    )
}

cat("=== Setup complete ===\n\n")


# =============================================================================
# 1. LOAD & CLEAN DATA
# =============================================================================

ANALYSIS_VARS <- c(
  # Outcomes
  "contention", "militiaarmed",
  # Christian identity predictors
  "ChristianKeywords", "ChristianRight", "ChristianLeft",
  "Practices", "ChristianLocation",
  # Controls
  "Racial", "COVID", "abortion", "LGBTQ", "conspiracy", "labor"
)

raw <- read_excel(DATA_PATH, col_types = "text")

dat <- raw %>%
  mutate(across(
    all_of(ANALYSIS_VARS),
    ~ as.numeric(trimws(as.character(.x)))
  )) %>%
  filter(if_all(all_of(ANALYSIS_VARS), ~ !is.na(.x)))

# Reduced sample for Models 1-6 (model comparison only; not used for inference)
dat_reduced <- dat[sample(nrow(dat), 8000), ]

cat(sprintf(
  "Data loaded -- Full sample: n = %s | Reduced sample (M1-M6): n = %s\n\n",
  format(nrow(dat), big.mark = ","),
  format(nrow(dat_reduced), big.mark = ",")
))


# =============================================================================
# 2. MODEL SPECIFICATIONS
# =============================================================================
#
# Nested structure: Models 1-5 each introduce one Christian identity measure
# in isolation with controls. Model 6 combines Right/Left. Model 7 is the
# full specification and is the basis for all statistical inference.

MODEL_SPECS <- list(
  M1 = c("ChristianKeywords"),
  M2 = c("ChristianRight", "ChristianLeft"),
  M3 = c("ChristianKeywords",
         "Racial", "COVID", "abortion", "LGBTQ", "conspiracy", "labor"),
  M4 = c("Practices",
         "Racial", "COVID", "abortion", "LGBTQ", "conspiracy", "labor"),
  M5 = c("ChristianLocation",
         "Racial", "COVID", "abortion", "LGBTQ", "conspiracy", "labor"),
  M6 = c("ChristianRight", "ChristianLeft",
         "Racial", "COVID", "abortion", "LGBTQ", "conspiracy", "labor"),
  M7 = c("ChristianRight", "ChristianLeft", "Practices", "ChristianLocation",
         "Racial", "COVID", "abortion", "LGBTQ", "conspiracy", "labor")
)

MODEL_DATA <- c(
  setNames(rep(list("reduced"), 6), paste0("M", 1:6)),
  list(M7 = "full")
)


# =============================================================================
# 3. ESTIMATE MODELS
# =============================================================================

fit_mvprobit <- function(data, predictors, nGHK, iterlim, finalHessian) {
  fml <- as.formula(
    paste0("cbind(contention, militiaarmed) ~ ",
           paste(predictors, collapse = " + "))
  )
  mvProbit(
    formula      = fml,
    data         = data,
    nGHK         = nGHK,
    iterlim      = iterlim,
    finalHessian = finalHessian,
    method       = "BHHH"
  )
}

# Use cached fits if available; re-estimate otherwise
if (file.exists(FITS_CACHE)) {

  cat("Loading cached model fits from:", FITS_CACHE, "\n\n")
  fits <- readRDS(FITS_CACHE)

} else {

  fits <- list()

  # Models 1-6: reduced sample, coarse simulation
  for (m in paste0("M", 1:6)) {
    t0 <- proc.time()
    cat("Estimating", m, "...")
    fits[[m]] <- fit_mvprobit(
      data         = dat_reduced,
      predictors   = MODEL_SPECS[[m]],
      nGHK         = 5,
      iterlim      = 40,
      finalHessian = "none"
    )
    elapsed <- round((proc.time() - t0)[["elapsed"]], 1)
    cat(sprintf(" done [%ss]\n", elapsed))
  }

  # Model 7: full sample, full precision (inference model)
  cat("Estimating M7 (full precision, full sample) ...")
  t0 <- proc.time()
  fits[["M7"]] <- fit_mvprobit(
    data         = dat,
    predictors   = MODEL_SPECS[["M7"]],
    nGHK         = 50,
    iterlim      = 300,
    finalHessian = "BHHH"
  )
  elapsed <- round((proc.time() - t0)[["elapsed"]], 1)
  cat(sprintf(" done [%ss]\n\n", elapsed))

  saveRDS(fits, FITS_CACHE)
  cat("Model fits cached to:", FITS_CACHE, "\n\n")
}


# =============================================================================
# 4. EXTRACT COEFFICIENTS & FIT STATISTICS
# =============================================================================

extract_coefs <- function(fit, data, predictors, model_label) {

  X <- model.matrix(
    as.formula(paste("~", paste(predictors, collapse = "+"))),
    data = data
  )
  k          <- ncol(X)
  n_outcomes <- 2
  beta       <- coef(fit)[seq_len(k * n_outcomes)]

  # Standard errors from Hessian (only available for M7)
  se_vec <- tryCatch(
    sqrt(diag(solve(-fit$hessian)))[seq_len(k * n_outcomes)],
    error = function(e) rep(NA_real_, k * n_outcomes)
  )

  data.frame(
    Model       = model_label,
    Outcome     = rep(c("Contention", "MilitiaArmed"), each = k),
    Predictor   = rep(colnames(X), times = n_outcomes),
    Coefficient = round(beta, 3),
    SE          = round(se_vec, 3),
    stringsAsFactors = FALSE
  ) %>%
    filter(Predictor != "(Intercept)")
}

coef_table <- bind_rows(
  extract_coefs(fits$M1, dat_reduced, MODEL_SPECS$M1, "M1"),
  extract_coefs(fits$M2, dat_reduced, MODEL_SPECS$M2, "M2"),
  extract_coefs(fits$M3, dat_reduced, MODEL_SPECS$M3, "M3"),
  extract_coefs(fits$M4, dat_reduced, MODEL_SPECS$M4, "M4"),
  extract_coefs(fits$M5, dat_reduced, MODEL_SPECS$M5, "M5"),
  extract_coefs(fits$M6, dat_reduced, MODEL_SPECS$M6, "M6"),
  extract_coefs(fits$M7, dat,         MODEL_SPECS$M7, "M7")
)

write.csv(coef_table,
          file.path(TAB_DIR, "mvprobit_nested_coefficients.csv"),
          row.names = FALSE)

# --- Fit statistics ----------------------------------------------------------
fit_stats <- bind_rows(lapply(names(fits), function(m) {
  fit  <- fits[[m]]
  data.frame(
    Model      = m,
    LogLik     = round(as.numeric(logLik(fit)), 2),
    N          = ifelse(m == "M7", nrow(dat), nrow(dat_reduced)),
    Predictors = length(MODEL_SPECS[[m]]),
    Converged  = isTRUE(fit$code == 1),
    stringsAsFactors = FALSE
  )
}))

write.csv(fit_stats,
          file.path(TAB_DIR, "mvprobit_nested_fitstats.csv"),
          row.names = FALSE)

cat("Coefficients and fit statistics saved to output/tables/\n\n")


# =============================================================================
# 5. PREDICTED PROBABILITIES -- MODEL 7
# =============================================================================
#
# For each Christian identity predictor, vary it from 0 to 1 while holding
# all other predictors at their sample means.

FOCAL_PREDS <- c("ChristianRight", "ChristianLeft", "Practices",
                 "ChristianLocation", "ChristianKeywords")

# Base grid: all predictors at their means
base_means <- setNames(
  lapply(MODEL_SPECS$M7, function(v) mean(dat[[v]])),
  MODEL_SPECS$M7
)

compute_probs <- function(focal_var, n_points = 50) {

  grid <- as.data.frame(lapply(base_means, rep, n_points))
  grid[[focal_var]] <- seq(0, 1, length.out = n_points)

  probs <- mvProbitExp(
    formula = ~ .,
    coef    = coef(fits$M7),
    data    = grid,
    nGHK    = 2000
  )

  as.data.frame(probs) %>%
    rename(Contention = V1, MilitiaArmed = V2) %>%
    mutate(
      FocalVar = focal_var,
      FocalVal = grid[[focal_var]]
    ) %>%
    pivot_longer(
      cols      = c(Contention, MilitiaArmed),
      names_to  = "Outcome",
      values_to = "Probability"
    )
}

prob_all <- bind_rows(lapply(
  intersect(FOCAL_PREDS, MODEL_SPECS$M7),
  compute_probs
))

prob_all$Outcome <- recode(prob_all$Outcome,
  MilitiaArmed = "Militia / Armed Actors"
)

write.csv(prob_all,
          file.path(TAB_DIR, "mvprobit_model7_probabilities.csv"),
          row.names = FALSE)

cat("Predicted probabilities saved to output/tables/\n\n")


# =============================================================================
# 6. FIGURES
# =============================================================================

# --- 6a. Predicted Probabilities (stacked facet) -----------------------------

fig_probs <- ggplot(
  prob_all,
  aes(x = FocalVal, y = Probability)
) +
  geom_line(linewidth = 0.75) +
  facet_grid(Outcome ~ FocalVar, scales = "free_y") +
  scale_x_continuous(breaks = c(0, 0.5, 1), labels = c("0", "0.5", "1")) +
  scale_y_continuous(limits = c(0, NA)) +
  labs(
    x       = "Focal Predictor Value",
    y       = "Predicted Probability",
    caption = paste(
      "Note: Each panel varies one predictor from 0 to 1 while holding",
      "all others at their sample means.\nEstimates from Model 7",
      "(full sample, full precision)."
    )
  ) +
  theme_publication()

ggsave(file.path(FIG_DIR, "Fig_Model7_PredictedProbabilities_Stacked.png"),
       fig_probs, width = 10, height = 5, dpi = 600)
ggsave(file.path(FIG_DIR, "Fig_Model7_PredictedProbabilities_Stacked.pdf"),
       fig_probs, width = 10, height = 5)

cat("Predicted probability figure saved.\n")


# --- 6b. Sign Table Figures --------------------------------------------------
#
# Summary sign tables showing direction and significance across nested models.
# Symbols: + / - with stars for significance; . = not significant; blank = excluded.

# Predictor ordering for sign table rows (lowercase to match actual variable names)
PREDICTOR_ORDER <- c(
  "ChristianKeywords", "ChristianRight", "ChristianLeft", "Practices",
  "ChristianLocation", "Racial", "COVID", "abortion", "LGBTQ",
  "conspiracy", "labor"
)

# Display-friendly labels for the y-axis in sign table plots
PREDICTOR_LABELS <- c(
  "ChristianKeywords" = "ChristianKeywords",
  "ChristianRight"    = "ChristianRight",
  "ChristianLeft"     = "ChristianLeft",
  "Practices"         = "Practices",
  "ChristianLocation" = "ChristianLocation",
  "Racial"            = "Racial",
  "COVID"             = "COVID",
  "abortion"          = "Abortion",
  "LGBTQ"             = "LGBTQ",
  "conspiracy"        = "Conspiracy",
  "labor"             = "Labor"
)

# Contention sign table data
# FIX #1: Each tribble row is now on its own line
sign_contention <- tribble(
  ~Predictor,          ~Model, ~Symbol,
  "ChristianKeywords", "M1",   "+**",
  "ChristianKeywords", "M3",   "+*",
  "ChristianRight",    "M6",   "+*",
  "ChristianRight",    "M7",   "+***",
  "ChristianLeft",     "M2",   "\u2212***",
  "ChristianLeft",     "M6",   "\u2212***",
  "ChristianLeft",     "M7",   "\u2212***",
  "Practices",         "M4",   ".",
  "Practices",         "M7",   "\u2212*",
  "ChristianLocation", "M5",   ".",
  "Racial",            "M3",   ".",
  "Racial",            "M4",   ".",
  "Racial",            "M5",   ".",
  "Racial",            "M6",   ".",
  "Racial",            "M7",   "\u2212**",
  "COVID",             "M3",   ".",
  "COVID",             "M4",   ".",
  "COVID",             "M5",   ".",
  "COVID",             "M6",   ".",
  "COVID",             "M7",   "\u2212**",
  "abortion",          "M3",   "\u2212**",
  "abortion",          "M4",   "\u2212**",
  "abortion",          "M5",   "\u2212*",
  "abortion",          "M6",   "\u2212***",
  "abortion",          "M7",   "\u2212***",
  "LGBTQ",             "M3",   ".",
  "LGBTQ",             "M4",   ".",
  "LGBTQ",             "M5",   ".",
  "LGBTQ",             "M6",   ".",
  "LGBTQ",             "M7",   "\u2212*",
  "conspiracy",        "M7",   "+**",
  "labor",             "M3",   "\u2212***",
  "labor",             "M4",   "\u2212***",
  "labor",             "M5",   "\u2212***",
  "labor",             "M6",   "\u2212***",
  "labor",             "M7",   "\u2212***"
)

# MilitiaArmed sign table data
sign_militia <- tribble(
  ~Predictor,          ~Model, ~Symbol,
  "ChristianKeywords", "M1",   "+*",
  "ChristianKeywords", "M3",   "+*",
  "ChristianRight",    "M2",   "+***",
  "ChristianRight",    "M6",   "+***",
  "ChristianRight",    "M7",   "+***",
  "ChristianLeft",     "M2",   ".",
  "ChristianLeft",     "M6",   ".",
  "ChristianLeft",     "M7",   "\u2212***",
  "Practices",         "M4",   "+***",
  "Practices",         "M7",   "+**",
  "ChristianLocation", "M5",   ".",
  "ChristianLocation", "M7",   "\u2212*",
  "Racial",            "M3",   ".",
  "Racial",            "M4",   ".",
  "Racial",            "M5",   ".",
  "Racial",            "M6",   ".",
  "Racial",            "M7",   "\u2212*",
  "COVID",             "M3",   ".",
  "COVID",             "M4",   ".",
  "COVID",             "M5",   ".",
  "COVID",             "M6",   ".",
  "COVID",             "M7",   "\u2212*",
  "abortion",          "M3",   "\u2212*",
  "abortion",          "M4",   "\u2212**",
  "abortion",          "M5",   "\u2212*",
  "abortion",          "M6",   "\u2212***",
  "abortion",          "M7",   "\u2212***",
  "LGBTQ",             "M3",   ".",
  "LGBTQ",             "M4",   ".",
  "LGBTQ",             "M5",   ".",
  "LGBTQ",             "M6",   ".",
  "LGBTQ",             "M7",   ".",
  "conspiracy",        "M3",   "+**",
  "conspiracy",        "M4",   "+*",
  "conspiracy",        "M5",   "+**",
  "conspiracy",        "M6",   "+*",
  "conspiracy",        "M7",   "+***",
  "labor",             "M3",   "\u2212***",
  "labor",             "M4",   "\u2212***",
  "labor",             "M5",   "\u2212***",
  "labor",             "M6",   "\u2212***",
  "labor",             "M7",   "\u2212***"
)

plot_sign_table <- function(df, title) {

  model_order <- paste0("M", 1:7)

  grid <- expand_grid(
    Predictor = PREDICTOR_ORDER,
    Model     = model_order
  )

  df2 <- grid %>%
    left_join(df, by = c("Predictor", "Model")) %>%
    mutate(
      Symbol    = replace_na(Symbol, ""),
      Stars     = gsub("^[+\u2212.]", "", Symbol),
      Sign      = gsub("\\*+", "", Symbol),
      Stars     = ifelse(Sign == ".", "", Stars),
      # Use display-friendly labels for the y-axis
      PredLabel = PREDICTOR_LABELS[Predictor],
      PredLabel = factor(PredLabel, levels = rev(PREDICTOR_LABELS)),
      Model     = factor(Model, levels = model_order),
      txt_col   = ifelse(as.character(Model) == "M7", "black", "grey60")
    )

  ggplot(df2, aes(x = Model, y = PredLabel)) +
    geom_text(aes(label = Sign, color = txt_col),
              size = 5.0, fontface = "bold", show.legend = FALSE) +
    geom_text(aes(label = Stars, color = txt_col),
              nudge_x = 0.16, size = 3.2, fontface = "bold",
              show.legend = FALSE) +
    scale_color_identity() +
    scale_x_discrete(position = "top") +
    labs(
      title   = title,
      caption = ". = not statistically significant;  blank = predictor not included in model"
    ) +
    theme_minimal(base_size = 11) +
    theme(
      panel.grid       = element_blank(),
      axis.title       = element_blank(),
      axis.text.x.top  = element_text(face = "bold"),
      axis.text.x.bottom = element_blank(),
      axis.ticks       = element_blank(),
      plot.title       = element_text(face = "bold", hjust = 0.5),
      plot.caption     = element_text(hjust = 0, size = 8, color = "grey40"),
      plot.margin      = margin(10, 15, 20, 15)
    )
}

p_contention <- plot_sign_table(
  sign_contention,
  "Coefficient Comparison Across Nested MVProbit Models (Contention)"
)

p_militia <- plot_sign_table(
  sign_militia,
  "Coefficient Comparison Across Nested MVProbit Models (Militia / Armed Actors)"
)

for (obj in list(
  list(p_contention, "Fig_SignTable_Contention"),
  list(p_militia,    "Fig_SignTable_MilitiaArmed")
)) {
  ggsave(file.path(FIG_DIR, paste0(obj[[2]], ".png")),
         obj[[1]], width = 9.5, height = 5.0, dpi = 600)
  ggsave(file.path(FIG_DIR, paste0(obj[[2]], ".pdf")),
         obj[[1]], width = 9.5, height = 5.0)
}

cat("Sign table figures saved.\n\n")


# =============================================================================
# 7. APPENDIX TABLES (WORD)
# =============================================================================
#
# Produces formatted coefficient tables for the appendix using flextable.
# One table per outcome, all models side by side.

# Require officer fp_border (defined before use in build_appendix_table)
fp_border <- officer::fp_border

build_appendix_table <- function(outcome_label, caption_text) {

  wide <- coef_table %>%
    filter(Outcome == outcome_label) %>%
    mutate(
      Cell = case_when(
        !is.na(SE) ~ sprintf("%.3f (%.3f)", Coefficient, SE),
        TRUE       ~ sprintf("%.3f (---)", Coefficient)
      )
    ) %>%
    select(Predictor, Model, Cell) %>%
    pivot_wider(names_from = Model, values_from = Cell) %>%
    # Use PREDICTOR_ORDER for row ordering (matches lowercase variable names
    # from coef_table, which come from colnames(model.matrix(...)))
    arrange(match(Predictor, PREDICTOR_ORDER))

  # Ensure all model columns present
  for (m in paste0("M", 1:7)) {
    if (!m %in% names(wide)) wide[[m]] <- ""
  }

  wide <- wide %>% select(Predictor, paste0("M", 1:7))

  ft <- flextable(wide) %>%
    set_caption(caption_text) %>%
    bold(part = "header") %>%
    fontsize(size = 10, part = "all") %>%
    font(fontname = "Times New Roman", part = "all") %>%
    align(align = "center", part = "all") %>%
    align(j = 1, align = "left", part = "all") %>%
    hline_top(part = "header", border = fp_border(width = 1.5)) %>%
    hline_bottom(part = "header", border = fp_border(width = 1)) %>%
    hline_bottom(part = "body",   border = fp_border(width = 1.5)) %>%
    autofit()

  ft
}

ft_contention <- build_appendix_table(
  "Contention",
  "Table A1. MVProbit Coefficients: Dependent Variable = Contention"
)

ft_militia <- build_appendix_table(
  "MilitiaArmed",
  "Table A2. MVProbit Coefficients: Dependent Variable = Militia / Armed Actors"
)

# Note block
note_text <- paste(
  "Standard errors in parentheses.",
  "Models 1-6 estimated on random subsample (n = 8,000) without Hessian.",
  "Model 7 estimated on full sample with BHHH Hessian; basis for inference.",
  "Blank cells indicate predictor not included in that model.",
  sep = " "
)

# Write to Word
doc <- read_docx() %>%
  body_add_par("Appendix Tables", style = "heading 1") %>%
  body_add_par("") %>%
  body_add_flextable(ft_contention) %>%
  body_add_par("") %>%
  body_add_par(note_text, style = "Normal") %>%
  body_add_break() %>%
  body_add_flextable(ft_militia) %>%
  body_add_par("") %>%
  body_add_par(note_text, style = "Normal")

print(doc, target = file.path(TAB_DIR, "Appendix_Tables_A1_A2.docx"))

# Also save as CSV
write.csv(
  coef_table %>% filter(Outcome == "Contention"),
  file.path(TAB_DIR, "Appendix_Table_A1.csv"),
  row.names = FALSE
)
write.csv(
  coef_table %>% filter(Outcome == "MilitiaArmed"),
  file.path(TAB_DIR, "Appendix_Table_A2.csv"),
  row.names = FALSE
)

cat("Appendix tables saved to output/tables/\n\n")


# =============================================================================
# 8. SESSION INFO
# =============================================================================

sink(file.path(OUT_DIR, "session_info.txt"))
cat("Replication session info\n")
cat("Generated:", format(Sys.time(), "%Y-%m-%d %H:%M:%S"), "\n\n")
print(sessionInfo())
sink()

cat("Session info saved to output/session_info.txt\n\n")


# =============================================================================
# DONE
# =============================================================================

cat(paste(rep("=", 60), collapse = ""), "\n")
cat("REPLICATION COMPLETE\n")
cat(paste(rep("=", 60), collapse = ""), "\n")
cat(sprintf("Outputs written to: %s/\n", normalizePath(OUT_DIR)))
cat("\nFigures:\n")
for (f in list.files(FIG_DIR)) cat(" ", f, "\n")
cat("\nTables:\n")
for (f in list.files(TAB_DIR)) cat(" ", f, "\n")
